### Quantitative data analysis for the ict project
### ZhiMin Xiao 2014
### Many thanks to Professor Gary King, Solé Prillaman, and Stephen Pettigrew for your feedback on my homework assignments and the quantitative strand of the analysis for this paper.

#rm(list = ls()); ls()
setwd("/Users/XZM/Dropbox/wangba/R"); load("ict.RData"); ls()

# Check for packages and install them if not already installed
required.packages <- c("plm", "matrixStats", "ellipse", "vcd", "foreign", "MatchIt", "cem", "car", "aod", "survey", "texreg", "ggplot2", "lattice", "mvtnorm", "Zelig", "mice", "ZeligChoice", "Amelia", "xtable", "maps", "mapdata", "maptools", "scales"); new.packages <- required.packages[!(required.packages %in% installed.packages()[,"Package"])]; if(length(new.packages)) install.packages(new.packages); invisible(lapply(required.packages, function(x) {library(x,character.only=TRUE)}))


###### logit model to predict academic achievement

# make academic ranking a binary variable called achieve, average or below as 0; above average or above as 1; prefer not to say and NA as NA
phd$achieve <- NA; phd$achieve[phd$rank %in% c(1,2,3)] <- 0; phd$achieve[phd$rank %in% c(4,5)] <- 1; phd$achieve; sum(phd$achieve==1,na.rm=T); sum(is.na(phd$achieve)) # 94 missing values, 249 in total reported achievement

# make viscafe (level of exposure to wangba) a binary variable 
phd$T <- NA; phd$T[phd$viscafe %in% c(0)] <- 1; phd$T[phd$viscafe %in% c(1,2,3,4)] <- 0; phd$T; sum(is.na(phd$T)); sum(phd$T==1,na.rm=T) # 10 missing values, in total, 206 are treated
# phd$viscafe # 0 never before, 1 prefer not to say, 2 weekends or holidays, 3, often, even during term time, 4 can't cope without it


# create a dataset for matching
select.m <- c("achieve", "T", "diqu", "htdwell", "pe", "icts", "acts", "zdpt", "wenli"); select.m %in% colnames(phd); data.lw <- na.omit(phd[select.m]); nrow(data.lw) # 508 observations

# the log-likelihood function
ll.logit <- function(beta, X, y){
-sum(log(1+exp((1-2*y)*X%*%beta)))	
}

y <- as.matrix(data.lw[,"achieve"]); X <- as.matrix(cbind(1, data.lw[,c("T", "diqu", "htdwell", "pe", "icts", "acts", "zdpt", "wenli")])); head(X)

opt.t1m1 <- optim(par=rep(0,9), y=y, X=X, fn=ll.logit, method="BFGS", control=list(fnscale=-1), hessian=T); opt.t1m1; t1m1.coef <- opt.t1m1$par; head(X); t1m1.coef; t1m1.ses <- sqrt(diag(solve(-opt.t1m1$hessian))); t1m1.ses

achieve <- y==1; nrow(X[achieve,]); sum(achieve); base.line <- c(apply(X[achieve,], 2, median)); base.line

#=====
base.line[3] <- 1; base.line # school Basum

library(MASS); set.seed(1405973792); betas <- mvrnorm(n=50000, mu=t1m1.coef, Sigma=solve(-opt.t1m1$hessian)); t <- c <- base.line; c["T"] <- 0; t["T"] <- 1; c; t

first.diffs <- 1/(1+exp(-t%*%t(betas))) - 1/(1+exp(-c%*%t(betas))); mean(first.diffs); quantile(first.diffs, c(.025, .975))  

#=====
base.line[3] <- 2; base.line # school Hengshan

library(MASS); set.seed(1405973792); betas <- mvrnorm(n=50000, mu=t1m1.coef, Sigma=solve(-opt.t1m1$hessian)); t <- c <- base.line; c["T"] <- 0; t["T"] <- 1; c; t

first.diffs <- 1/(1+exp(-t%*%t(betas))) - 1/(1+exp(-c%*%t(betas))); mean(first.diffs); quantile(first.diffs, c(.025, .975))   

#=====
base.line[3] <- 3; base.line # school Nanshan

library(MASS); set.seed(1405973792); betas <- mvrnorm(n=50000, mu=t1m1.coef, Sigma=solve(-opt.t1m1$hessian)); t <- c <- base.line; c["T"] <- 0; t["T"] <- 1; c; t

first.diffs <- 1/(1+exp(-t%*%t(betas))) - 1/(1+exp(-c%*%t(betas))); mean(first.diffs); quantile(first.diffs, c(.025, .975)) 

par(mfrow=c(1,3)); plot(density(first.diffs), col="red", main="Basum"); abline(v=0, col="steelblue", lty=2); plot(density(first.diffs), col="red", main="Hengshan"); abline(v=0, col="steelblue", lty=2); plot(density(first.diffs), col="red", main="Nanshan"); abline(v=0, col="steelblue", lty=2)

#### baseline imbalance

bl.imb <- imbalance(data.lw$T, data.lw, drop=c("achieve", "T")); bl.imb

m.t <- matchit(formula = T ~ diqu + htdwell + pe + acts, data=data.lw, method="cem", eval.imbalance=T); summary(m.t); mt.data <- match.data(m.t); head(mt.data); dim(mt.data)

plot(m.t, type="QQ", which.xs=c("htdwell", "acts", "pe"), col="red")
m.t$nn[2,]
plot(m.t, type="jitter", col="steelblue", interactive=F)
plot(m.t, type="hist", border=c("cornflowerblue"))

fit <- zelig(achieve ~ T + diqu + pe + htdwell + acts, mt.data, model="logit"); summary(fit)

x.out0 <- setx(fit, T=0, diqu=1); x.out1 <- setx(fit, T=1, diqu=1); set.seed(1405973792); s.out <- sim(fit, x=x.out0, x1=x.out1); summary(s.out); plot(s.out)

x.out0 <- setx(fit, T=0, diqu=2); x.out1 <- setx(fit, T=1, diqu=2); set.seed(1405973792); s.out <- sim(fit, x=x.out0, x1=x.out1); summary(s.out); plot(s.out)

x.out0 <- setx(fit, T=0, diqu=3); x.out1 <- setx(fit, T=1, diqu=3); set.seed(1405973792); s.out <- sim(fit, x=x.out0, x1=x.out1); summary(s.out); plot(s.out, )

### using Amelia to impute missing values

select.m <- c("achieve", "T", "diqu", "htdwell", "icts", "pe", "acts"); select.m %in% colnames(phd); data.m <- phd[select.m]; dim(data.m); n <- dim(data.m)[1]; k <- dim(data.m)[2]; data.mi <- data.m; idx <- sample(1:n, .3*n); sum(is.na(data.m$achieve)) # 94 missing values

set.seed(1405973792); invisible(sapply(idx, function(x) data.mi[x, sample(2:k,1)]<<-NA)); set.seed(1405973792); imputed <- amelia(data.mi, 10, p2s=0, noms=c("T", "achieve")); mi.dsets <- imputed$imputations[1:10]; summary(mi.dsets)

mat <- cem("T", datalist=mi.dsets, drop=c("achieve")); mat; att(mat, achieve ~ T, data=mi.dsets)

